From f30d6c75652c95152eb0dbe6bf9da2198a780a84 Mon Sep 17 00:00:00 2001
From: Timothy Pearson <tpearson@raptorengineeringinc.com>
Date: Tue, 28 Jul 2015 15:16:46 -0500
Subject: [PATCH 088/143] northbridge/amd/amdmct/mct_ddr3: Add registered and
 x4 DIMM support to Fam15h

Change-Id: I9ee0bb7346aa35f564fe535cdd337ec7f6148f2b
Signed-off-by: Timothy Pearson <tpearson@raptorengineeringinc.com>
---
 src/northbridge/amd/amdmct/mct_ddr3/mct_d.c    |  186 ++++++-----
 src/northbridge/amd/amdmct/mct_ddr3/mct_d.h    |    2 +-
 src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c |    4 +
 src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c   |   17 +-
 src/northbridge/amd/amdmct/mct_ddr3/mctrci.c   |  191 +++++++----
 src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c   |   42 ++-
 src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c   |  253 ++++++++-------
 src/northbridge/amd/amdmct/mct_ddr3/mctwl.c    |   16 +-
 src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c  |  400 +++++++++++++++---------
 src/northbridge/amd/amdmct/mct_ddr3/mwlc_d.h   |   13 +-
 10 files changed, 698 insertions(+), 426 deletions(-)

diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c
index b29ff3c..1c9c568 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c
@@ -166,7 +166,7 @@ static void mct_EnDllShutdownSR(struct MCTStatStruc *pMCTstat,
 static void ChangeMemClk(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat);
 void SetTargetFreq(struct MCTStatStruc *pMCTstat,
-                                        struct DCTStatStruc *pDCTstat);
+                                        struct DCTStatStruc *pDCTstatA, uint8_t Node);
 
 static u32 mct_MR1Odt_RDimm(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat, u8 dct, u32 MrsChipSel);
@@ -1404,6 +1404,10 @@ static void precise_memclk_delay_fam15(struct MCTStatStruc *pMCTstat, struct DCT
 
 	memclk_freq = Get_NB32_DCT(pDCTstat->dev_dct, dct, 0x94) & 0x1f;
 
+	if (fam15h_freq_tab[memclk_freq] == 0) {
+		printk(BIOS_DEBUG, "ERROR: precise_memclk_delay_fam15 for DCT %d (delay %d clocks) failed to obtain valid memory frequency! (pDCTstat: %p pDCTstat->dev_dct: %08x memclk_freq: %02x)\n", dct, clocks, pDCTstat, pDCTstat->dev_dct, memclk_freq);
+		return;	/* bail out: the division below would divide by this zero table entry */
+	}
 	delay_ns = (((uint64_t)clocks * 1000) / fam15h_freq_tab[memclk_freq]);
 	precise_ndelay_fam15(pMCTstat, delay_ns);
 }
@@ -2320,7 +2324,7 @@ static void DQSTiming_D(struct MCTStatStruc *pMCTstat,
 	nv_DQSTrainCTL = !allow_config_restore;
 
 	mct_BeforeDQSTrain_D(pMCTstat, pDCTstatA);
-	phyAssistedMemFnceTraining(pMCTstat, pDCTstatA);
+	phyAssistedMemFnceTraining(pMCTstat, pDCTstatA, -1);
 
 	if (is_fam15h()) {
 		uint8_t Node;
@@ -3359,7 +3363,7 @@ static void SPD2ndTiming(struct MCTStatStruc *pMCTstat,
 }
 
 static u8 AutoCycTiming_D(struct MCTStatStruc *pMCTstat,
-				struct DCTStatStruc *pDCTstat, u8 dct)
+				struct DCTStatStruc *pDCTstat, uint8_t dct)
 {
 	/* Initialize  DCT Timing registers as per DIMM SPD.
 	 * For primary timing (T, CL) use best case T value.
@@ -3463,7 +3467,7 @@ static void GetPresetmaxF_D(struct MCTStatStruc *pMCTstat,
 }
 
 static void SPDGetTCL_D(struct MCTStatStruc *pMCTstat,
-				struct DCTStatStruc *pDCTstat, u8 dct)
+				struct DCTStatStruc *pDCTstat, uint8_t dct)
 {
 	/* Find the best T and CL primary timing parameter pair, per Mfg.,
 	 * for the given set of DIMMs, and store into DCTStatStruc
@@ -3742,10 +3746,15 @@ static u8 AutoConfig_D(struct MCTStatStruc *pMCTstat,
 		dword++;
 	}
 
-	if (Status & (1 << SB_Registered))
-		DramConfigLo |= 1 << ParEn;		/* Registered DIMMs */
-	else
-		DramConfigLo |= 1 << UnBuffDimm;	/* Unbuffered DIMMs */
+	if (Status & (1 << SB_Registered)) {
+		/* Registered DIMMs */
+		if (!is_fam15h()) {
+			DramConfigLo |= 1 << ParEn;
+		}
+	} else {
+		/* Unbuffered DIMMs */
+		DramConfigLo |= 1 << UnBuffDimm;
+	}
 
 	if (mctGet_NVbits(NV_ECC_CAP))
 		if (Status & (1 << SB_ECCDIMMs))
@@ -3763,10 +3772,11 @@ static u8 AutoConfig_D(struct MCTStatStruc *pMCTstat,
 	DramConfigHi |= dword - offset;	/* get MemClk encoding */
 	DramConfigHi |= 1 << MemClkFreqVal;
 
-	if (Status & (1 << SB_Registered))
-		if ((pDCTstat->Dimmx4Present != 0) && (pDCTstat->Dimmx8Present != 0))
-			/* set only if x8 Registered DIMMs in System*/
-			DramConfigHi |= 1 << RDqsEn;
+	if (!is_fam15h())
+		if (Status & (1 << SB_Registered))
+			if ((pDCTstat->Dimmx4Present != 0) && (pDCTstat->Dimmx8Present != 0))
+				/* set only if x8 Registered DIMMs in System*/
+				DramConfigHi |= 1 << RDqsEn;
 
 	if (pDCTstat->LogicalCPUID & AMD_FAM15_ALL) {
 		DramConfigLo |= 1 << 25;	/* PendRefPaybackS3En = 1 */
@@ -3778,14 +3788,16 @@ static u8 AutoConfig_D(struct MCTStatStruc *pMCTstat,
 			DramConfigHi |= 1 << 16;
 	}
 
-	/* Control Bank Swizzle */
-	if (0) /* call back not needed mctBankSwizzleControl_D()) */
-		DramConfigHi &= ~(1 << BankSwizzleMode);
-	else
-		DramConfigHi |= 1 << BankSwizzleMode; /* recommended setting (default) */
+	if (!is_fam15h()) {
+		/* Control Bank Swizzle */
+		if (0) /* call back not needed mctBankSwizzleControl_D()) */
+			DramConfigHi &= ~(1 << BankSwizzleMode);
+		else
+			DramConfigHi |= 1 << BankSwizzleMode; /* recommended setting (default) */
+	}
 
 	/* Check for Quadrank DIMM presence */
-	if ( pDCTstat->DimmQRPresent != 0) {
+	if (pDCTstat->DimmQRPresent != 0) {
 		byte = mctGet_NVbits(NV_4RANKType);
 		if (byte == 2)
 			DramConfigHi |= 1 << 17;	/* S4 (4-Rank SO-DIMMs) */
@@ -4590,8 +4602,9 @@ static u8 mct_setMode(struct MCTStatStruc *pMCTstat,
 			Set_NB32(pDCTstat->dev_dct, reg, val);
 		}
 		if (byte)	/* NV_Unganged */
-			pDCTstat->ErrStatus &= ~(1 << SB_DimmMismatchO); /* Clear so that there is no DIMM missmatch error */
+			pDCTstat->ErrStatus &= ~(1 << SB_DimmMismatchO); /* Clear so that there is no DIMM mismatch error */
 	}
+
 	return pDCTstat->ErrCode;
 }
 
@@ -4652,6 +4665,8 @@ void Set_NB32_index_wait(u32 dev, u32 index_reg, u32 index, u32 data)
 static u8 mct_BeforePlatformSpec(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat, u8 dct)
 {
+	printk(BIOS_DEBUG, "%s: Start\n", __func__);
+
 	/* mct_checkForCxDxSupport_D */
 	if (pDCTstat->LogicalCPUID & AMD_DR_GT_Bx) {
 		/* Family 10h Errata 322: Address and Command Fine Delay Values May Be Incorrect */
@@ -4666,6 +4681,9 @@ static u8 mct_BeforePlatformSpec(struct MCTStatStruc *pMCTstat,
 		else
 			Set_NB32_index_wait_DCT(pDCTstat->dev_dct, dct, 0x98, 0x0D02E001, 0x90);
 	}
+
+	printk(BIOS_DEBUG, "%s: Done\n", __func__);
+
 	return pDCTstat->ErrCode;
 }
 
@@ -4676,6 +4694,8 @@ static u8 mct_PlatformSpec(struct MCTStatStruc *pMCTstat,
 	 * and program them into DCT.
 	 */
 
+	printk(BIOS_DEBUG, "%s: Start\n", __func__);
+
 	u32 dev = pDCTstat->dev_dct;
 	u32 index_reg;
 	u8 i, i_start, i_end;
@@ -4696,6 +4716,8 @@ static u8 mct_PlatformSpec(struct MCTStatStruc *pMCTstat,
 		printk(BIOS_SPEW, "Programmed DCT %d timing/termination pattern %08x %08x\n", dct, pDCTstat->CH_ADDR_TMG[i], pDCTstat->CH_ODC_CTL[i]);
 	}
 
+	printk(BIOS_DEBUG, "%s: Done\n", __func__);
+
 	return pDCTstat->ErrCode;
 }
 
@@ -4707,7 +4729,8 @@ static void mct_SyncDCTsReady(struct DCTStatStruc *pDCTstat)
 	if (pDCTstat->NodePresent) {
 		dev = pDCTstat->dev_dct;
 
-		if ((pDCTstat->DIMMValidDCT[0] ) || (pDCTstat->DIMMValidDCT[1])) {		/* This Node has dram */
+		if ((pDCTstat->DIMMValidDCT[0]) || (pDCTstat->DIMMValidDCT[1])) {
+			/* This Node has DRAM */
 			do {
 				val = Get_NB32(dev, 0x110);
 			} while (!(val & (1 << DramEnabled)));
@@ -5655,57 +5678,56 @@ static void InitDDRPhy(struct MCTStatStruc *pMCTstat,
 	/* Fam15h BKDG v3.14 section 2.10.5.3
 	 * The remainder of the Phy Initialization algorithm picks up in phyAssistedMemFnceTraining
 	 */
-	for (dct = 0; dct < 2; dct++) {
-		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0000000b, 0x80000000);
-		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fe013, 0x00000118);
-
-		/* Program desired VDDIO level */
-		if (ddr_voltage_index & 0x4) {
-			/* 1.25V */
-			amd_voltage_level_index = 0x2;
-		} else if (ddr_voltage_index & 0x2) {
-			/* 1.35V */
-			amd_voltage_level_index = 0x1;
-		} else if (ddr_voltage_index & 0x1) {
-			/* 1.50V */
-			amd_voltage_level_index = 0x0;
-		}
-
-		/* D18F2x9C_x0D0F_0[F,8:0]1F_dct[1:0][RxVioLvl] */
-		for (index = 0; index < 0x9; index++) {
-			dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f001f | (index << 8));
-			dword &= ~(0x3 << 3);
-			dword |= (amd_voltage_level_index << 3);
-			Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f001f | (index << 8), dword);
-		}
-
-		/* D18F2x9C_x0D0F_[C,8,2][2:0]1F_dct[1:0][RxVioLvl] */
-		for (index = 0; index < 0x3; index++) {
-			dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f201f | (index << 8));
-			dword &= ~(0x3 << 3);
-			dword |= (amd_voltage_level_index << 3);
-			Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f201f | (index << 8), dword);
-		}
-		for (index = 0; index < 0x2; index++) {
-			dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f801f | (index << 8));
-			dword &= ~(0x3 << 3);
-			dword |= (amd_voltage_level_index << 3);
-			Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f801f | (index << 8), dword);
-		}
-		for (index = 0; index < 0x1; index++) {
-			dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fc01f | (index << 8));
-			dword &= ~(0x3 << 3);
-			dword |= (amd_voltage_level_index << 3);
-			Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fc01f | (index << 8), dword);
-		}
+	Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0000000b, 0x80000000);
+	Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fe013, 0x00000118);
 
-		/* D18F2x9C_x0D0F_4009_dct[1:0][CmpVioLvl, ComparatorAdjust] */
-		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f4009);
-		dword &= ~(0x0000c00c);
-		dword |= (amd_voltage_level_index << 14);
-		dword |= (amd_voltage_level_index << 2);
-		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f4009, dword);
-	}
+	/* Program desired VDDIO level */
+	if (ddr_voltage_index & 0x4) {
+		/* 1.25V */
+		amd_voltage_level_index = 0x2;
+	} else if (ddr_voltage_index & 0x2) {
+		/* 1.35V */
+		amd_voltage_level_index = 0x1;
+	} else if (ddr_voltage_index & 0x1) {
+		/* 1.50V */
+		amd_voltage_level_index = 0x0;
+	}
+
+	/* D18F2x9C_x0D0F_0[F,8:0]1F_dct[1:0][RxVioLvl] */
+	for (index = 0; index < 0x9; index++) {
+		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f001f | (index << 8));
+		dword &= ~(0x3 << 3);
+		dword |= (amd_voltage_level_index << 3);
+		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f001f | (index << 8), dword);
+	}
+
+	/* D18F2x9C_x0D0F_[C,8,2][2:0]1F_dct[1:0][RxVioLvl] */
+	for (index = 0; index < 0x3; index++) {
+		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f201f | (index << 8));
+		dword &= ~(0x3 << 3);
+		dword |= (amd_voltage_level_index << 3);
+		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f201f | (index << 8), dword);
+	}
+	for (index = 0; index < 0x2; index++) {
+		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f801f | (index << 8));
+		dword &= ~(0x3 << 3);
+		dword |= (amd_voltage_level_index << 3);
+		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0f801f | (index << 8), dword);
+	}
+	for (index = 0; index < 0x1; index++) {
+		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fc01f | (index << 8));
+		dword &= ~(0x3 << 3);
+		dword |= (amd_voltage_level_index << 3);
+		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fc01f | (index << 8), dword);
+	}
+
+	/* D18F2x9C_x0D0F_4009_dct[1:0][CmpVioLvl, ComparatorAdjust] */
+	/* NOTE: CmpVioLvl and ComparatorAdjust only take effect when set on DCT 0 */
+	dword = Get_NB32_index_wait_DCT(dev, 0, index_reg, 0x0d0f4009);
+	dword &= ~(0x0000c00c);
+	dword |= (amd_voltage_level_index << 14);
+	dword |= (amd_voltage_level_index << 2);
+	Set_NB32_index_wait_DCT(dev, 0, index_reg, 0x0d0f4009, dword);
 
 	printk(BIOS_DEBUG, "%s: Done\n", __func__);
 }
@@ -5721,18 +5743,24 @@ static void InitPhyCompensation(struct MCTStatStruc *pMCTstat,
 	uint32_t dword;
 	const u8 *p;
 
-	printk(BIOS_DEBUG, "%s: Start\n", __func__);
+	printk(BIOS_DEBUG, "%s: DCT %d: Start\n", __func__, dct);
 
 	if (is_fam15h()) {
 		/* Algorithm detailed in the Fam15h BKDG Rev. 3.14 section 2.10.5.3.4 */
 		uint32_t tx_pre;
 		uint32_t drive_strength;
 
-		/* Program D18F2x9C_x0D0F_E003_dct[1:0][DisAutoComp, DisablePredriverCal] */
+		/* Program D18F2x9C_x0D0F_E003_dct[1:0][DisAutoComp] */
 		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fe003);
-		dword |= (0x3 << 13);
+		dword |= (0x1 << 14);
 		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fe003, dword);
 
+		/* Program D18F2x9C_x0D0F_E003_dct[1:0][DisablePredriverCal] */
+		/* NOTE: DisablePredriverCal only takes effect when set on DCT 0 */
+		dword = Get_NB32_index_wait_DCT(dev, 0, index_reg, 0x0d0fe003);
+		dword |= (0x1 << 13);
+		Set_NB32_index_wait_DCT(dev, 0, index_reg, 0x0d0fe003, dword);
+
 		/* Determine TxPreP/TxPreN for data lanes (Stage 1) */
 		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x00000000);
 		drive_strength = (dword >> 20) & 0x7;	/* DqsDrvStren */
@@ -5878,12 +5906,14 @@ static void InitPhyCompensation(struct MCTStatStruc *pMCTstat,
 		Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x0a, dword);
 	}
 
-	printk(BIOS_DEBUG, "%s: Done\n", __func__);
+	printk(BIOS_DEBUG, "%s: DCT %d: Done\n", __func__, dct);
 }
 
 static void mct_EarlyArbEn_D(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat, u8 dct)
 {
+	printk(BIOS_DEBUG, "%s: Start\n", __func__);
+
 	if (!is_fam15h()) {
 		u32 reg;
 		u32 val;
@@ -5905,6 +5935,8 @@ static void mct_EarlyArbEn_D(struct MCTStatStruc *pMCTstat,
 
 		Set_NB32_DCT(dev, dct, reg, val);
 	}
+
+	printk(BIOS_DEBUG, "%s: Done\n", __func__);
 }
 
 static u8 CheckNBCOFEarlyArbEn(struct MCTStatStruc *pMCTstat,
@@ -6548,6 +6580,8 @@ void mct_SetDramConfigHi_D(struct MCTStatStruc *pMCTstat,
 
 	uint32_t dword;
 
+	printk(BIOS_DEBUG, "%s: Start\n", __func__);
+
 	if (is_fam15h()) {
 		/* Initial setup for frequency change
 		 * 9C_x0000_0004 must be configured before MemClkFreqVal is set
@@ -6580,6 +6614,8 @@ void mct_SetDramConfigHi_D(struct MCTStatStruc *pMCTstat,
 		mct_Wait(100);
 	}
 
+	printk(BIOS_DEBUG, "mct_SetDramConfigHi_D: DramConfigHi:    %08x\n", DramConfigHi);
+
 	/* Program the DRAM Configuration High register */
 	Set_NB32_DCT(dev, dct, 0x94, DramConfigHi);
 
@@ -6595,6 +6631,8 @@ void mct_SetDramConfigHi_D(struct MCTStatStruc *pMCTstat,
 		dword |= 0x0000000f;
 		Set_NB32_index_wait_DCT(pDCTstat->dev_dct, dct, index_reg, 0x0d0fe006, dword);
 	}
+
+	printk(BIOS_DEBUG, "%s: Done\n", __func__);
 }
 
 static void mct_BeforeDQSTrain_D(struct MCTStatStruc *pMCTstat,
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h
index e327d38..486b16c 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h
@@ -1014,7 +1014,7 @@ void InterleaveNodes_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTs
 void InterleaveChannels_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA);
 void mct_BeforeDQSTrain_Samp_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat);
 
-void phyAssistedMemFnceTraining(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA);
+void phyAssistedMemFnceTraining(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA, int16_t Node);
 u8 mct_SaveRcvEnDly_D_1Pass(struct DCTStatStruc *pDCTstat, u8 pass);
 u8 mct_InitReceiver_D(struct DCTStatStruc *pDCTstat, u8 dct);
 void mct_Wait(u32 cycles);
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c b/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c
index 36e9858..c70fa6d 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c
@@ -1588,6 +1588,7 @@ static void TrainDQSReceiverEnCyc_D_Fam15(struct MCTStatStruc *pMCTstat,
 
 	for (dct = 0; dct < 2; dct++) {
 		/* Program D18F2x9C_x0D0F_E003_dct[1:0][DisAutoComp, DisablePredriverCal] */
+		/* NOTE: DisablePredriverCal only takes effect when set on DCT 0 */
 		dword = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x0d0fe003);
 		dword &= ~(0x3 << 13);
 		dword |= (0x1 << 13);
@@ -1627,6 +1628,9 @@ static void TrainDQSReceiverEnCyc_D_Fam15(struct MCTStatStruc *pMCTstat,
 				rx_en_offset = (initial_phy_phase_delay[lane] + 0x10) % 0x40;
 
 				/* 2.10.5.8.3 (4) */
+#if DQS_TRAIN_DEBUG > 0
+				printk(BIOS_DEBUG, "TrainDQSReceiverEnCyc_D_Fam15 Receiver %d lane %d initial phy delay %04x: iterating from %04x to %04x\n", Receiver, lane, initial_phy_phase_delay[lane], rx_en_offset, 0x3ff);
+#endif
 				for (current_phy_phase_delay[lane] = rx_en_offset; current_phy_phase_delay[lane] < 0x3ff; current_phy_phase_delay[lane] += ren_step) {
 					/* 2.10.5.8.3 (4 A) */
 					write_dqs_receiver_enable_control_registers(current_phy_phase_delay, dev, dct, dimm, index_reg);
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c b/src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c
index 539cb0d..1b81d15 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c
@@ -21,7 +21,7 @@
 static uint8_t AgesaHwWlPhase1(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat, u8 dct, u8 dimm, u8 pass);
 static uint8_t AgesaHwWlPhase2(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat, u8 dct, u8 dimm, u8 pass);
+					struct DCTStatStruc *pDCTstat, uint8_t dct, uint8_t dimm, uint8_t pass);
 static uint8_t AgesaHwWlPhase3(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat, u8 dct, u8 dimm, u8 pass);
 static void EnableZQcalibration(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat);
@@ -133,7 +133,7 @@ static uint8_t PhyWLPass1(struct MCTStatStruc *pMCTstat,
 }
 
 static uint8_t PhyWLPass2(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat, u8 dct)
+					struct DCTStatStruc *pDCTstat, uint8_t dct, uint8_t final)
 {
 	u8 dimm;
 	u16 DIMMValid;
@@ -187,12 +187,15 @@ static uint16_t fam15h_next_highest_memclk_freq(uint16_t memclk_freq)
  * Algorithm detailed in the Fam10h BKDG Rev. 3.62 section 2.8.9.9.1
  */
 static void WriteLevelization_HW(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat, uint8_t Pass)
+					struct DCTStatStruc *pDCTstatA, uint8_t Node, uint8_t Pass)
 {
 	uint8_t status;
 	uint8_t timeout;
 	uint16_t final_target_freq;
 
+	struct DCTStatStruc *pDCTstat;
+	pDCTstat = pDCTstatA + Node;
+
 	pDCTstat->C_MCTPtr  = &(pDCTstat->s_C_MCTPtr);
 	pDCTstat->C_DCTPtr[0] = &(pDCTstat->s_C_DCTPtr[0]);
 	pDCTstat->C_DCTPtr[1] = &(pDCTstat->s_C_DCTPtr[1]);
@@ -240,13 +243,13 @@ static void WriteLevelization_HW(struct MCTStatStruc *pMCTstat,
 					pDCTstat->TargetFreq = fam15h_next_highest_memclk_freq(pDCTstat->Speed);
 				else
 					pDCTstat->TargetFreq = final_target_freq;
-				SetTargetFreq(pMCTstat, pDCTstat);
+				SetTargetFreq(pMCTstat, pDCTstatA, Node);
 				timeout = 0;
 				do {
 					status = 0;
 					timeout++;
-					status |= PhyWLPass2(pMCTstat, pDCTstat, 0);
-					status |= PhyWLPass2(pMCTstat, pDCTstat, 1);
+					status |= PhyWLPass2(pMCTstat, pDCTstat, 0, (pDCTstat->TargetFreq == final_target_freq));
+					status |= PhyWLPass2(pMCTstat, pDCTstat, 1, (pDCTstat->TargetFreq == final_target_freq));
 					if (status)
 						printk(BIOS_INFO,
 							"%s: Retrying write levelling due to invalid value(s) detected in last phase\n",
@@ -290,7 +293,7 @@ void mct_WriteLevelization_HW(struct MCTStatStruc *pMCTstat,
 		if (pDCTstat->NodePresent) {
 			mctSMBhub_Init(Node);
 			Clear_OnDimmMirror(pMCTstat, pDCTstat);
-			WriteLevelization_HW(pMCTstat, pDCTstat, Pass);
+			WriteLevelization_HW(pMCTstat, pDCTstatA, Node, Pass);
 			Restore_OnDimmMirror(pMCTstat, pDCTstat);
 		}
 	}
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctrci.c b/src/northbridge/amd/amdmct/mct_ddr3/mctrci.c
index 9617f84..624a543 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctrci.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctrci.c
@@ -18,6 +18,78 @@
  * Foundation, Inc.
  */
 
+static uint8_t fam15h_rdimm_rc2_control_code(struct DCTStatStruc *pDCTstat, uint8_t dct)
+{
+	/* Pick the RC2 control code for a Fam15h RDIMM channel from the socket type,
+	 * the DIMM slots per channel, the populated DIMM count and the MemClk encoding
+	 * read from DCT register 0x94.  Unlisted combinations yield 0.
+	 */
+	uint8_t slots_per_channel = mctGet_NVbits(NV_MAX_DIMMS_PER_CH);
+	uint8_t socket_type = mctGet_NVbits(NV_PACK_TYPE);
+	uint16_t memclk = Get_NB32_DCT(pDCTstat->dev_dct, dct, 0x94) & 0x1f;
+	uint8_t populated_dimms = pDCTstat->MAdimms[dct];
+	uint8_t code = 0;
+
+	/* FIXME
+	 * Assume there is only one register on the RDIMM for now
+	 */
+	uint8_t register_count = 1;
+
+	if (socket_type != PT_GR) {
+		/* TODO
+		 * Other socket support unimplemented
+		 */
+		return code;
+	}
+
+	/* Socket G34 */
+	/* Fam15h BKDG Rev. 3.14 section 2.10.5.7.1.2.1 Table 85 */
+	switch (slots_per_channel) {
+	case 1:
+		switch (memclk) {
+		case 0x4:
+		case 0x6:
+			/* DDR3-667 - DDR3-800 */
+			code = 0x1;
+			break;
+		case 0xa:
+		case 0xe:
+			/* DDR3-1066 - DDR3-1333 */
+			code = (register_count == 1) ? 0x0 : 0x1;
+			break;
+		case 0x12:
+		case 0x16:
+			/* DDR3-1600 - DDR3-1866 */
+			code = 0x0;
+			break;
+		default:
+			break;
+		}
+		break;
+	case 2:
+		if (populated_dimms == 1) {
+			/* 1 DIMM detected */
+			if ((memclk == 0x4) || (memclk == 0x6)) {
+				/* DDR3-667 - DDR3-800 */
+				code = 0x1;
+			} else if ((memclk >= 0xa) && (memclk <= 0x12)) {
+				/* DDR3-1066 - DDR3-1600 */
+				code = (register_count == 1) ? 0x0 : 0x1;
+			}
+		} else if (populated_dimms == 2) {
+			/* 2 DIMMs detected */
+			code = (register_count == 1) ? 0x1 : 0x8;
+		}
+		break;
+	default:
+		/* TODO
+		 * 3 DIMM/channel support unimplemented
+		 */
+		break;
+	}
+	return code;
+}
+
 static uint16_t memclk_to_freq(uint16_t memclk) {
 	uint16_t fam10h_freq_tab[] = {0, 0, 0, 400, 533, 667, 800};
 	uint16_t fam15h_freq_tab[] = {0, 0, 0, 0, 333, 0, 400, 0, 0, 0, 533, 0, 0, 0, 667, 0, 0, 0, 800, 0, 0, 0, 933};
@@ -37,36 +109,46 @@ static uint16_t memclk_to_freq(uint16_t memclk) {
 	return mem_freq;
 }
 
+static uint8_t rc_word_chip_select_lower_bit(void) {
+	/* Shift amount used to place MrsChipSel in an RC word: 21 on Fam15h, else 20 */
+	if (!is_fam15h()) {
+		return 20;
+	}
+	return 21;
+}
+
+static uint32_t rc_word_address_to_ctl_bits(uint32_t address) {
+	/* Address bits [2:0] stay in place; bit 3 moves to bit 20 (Fam15h) or bit 18 */
+	uint32_t low_bits = address & 0x7;
+	uint32_t high_bit = (address >> 3) & 0x1;
+
+	return (high_bit << (is_fam15h() ? 20 : 18)) | low_bits;
+}
+
 static uint32_t rc_word_value_to_ctl_bits(uint32_t value) {
-	return ((value >> 2) & 3) << 16 | ((value & 3) << 3);
+	/* Value bits [1:0] go to bits [4:3]; bits [3:2] go to [19:18] (Fam15h) or [17:16] */
+	uint32_t low_pair = (value & 0x3) << 3;
+	uint32_t high_pair = (value >> 2) & 0x3;
+
+	return (high_pair << (is_fam15h() ? 18 : 16)) | low_pair;
 }
 
 static u32 mct_ControlRC(struct MCTStatStruc *pMCTstat,
-			struct DCTStatStruc *pDCTstat, u32 MrsChipSel, u32 CtrlWordNum)
+			struct DCTStatStruc *pDCTstat, uint8_t dct, u32 MrsChipSel, u32 CtrlWordNum)
 {
 	u8 Dimms, DimmNum;
 	u32 val;
-	u32 dct = 0;
 	uint8_t ddr_voltage_index;
 	uint16_t mem_freq;
 	uint8_t package_type = mctGet_NVbits(NV_PACK_TYPE);
 	uint8_t MaxDimmsInstallable = mctGet_NVbits(NV_MAX_DIMMS_PER_CH);
 
-	DimmNum = (MrsChipSel >> 20) & 0xFE;
+	DimmNum = (MrsChipSel >> rc_word_chip_select_lower_bit()) & 0xfe;
 
-	/* assume dct=0; */
-	/* if (dct == 1) */
-	/* DimmNum ++; */
-	/* cl +=8; */
+	if (dct == 1)
+		DimmNum++;
 
 	mem_freq = memclk_to_freq(pDCTstat->DIMMAutoSpeed);
-
-	if (pDCTstat->CSPresent_DCT[0] > 0) {
-		dct = 0;
-	} else if (pDCTstat->CSPresent_DCT[1] > 0 ) {
-		dct = 1;
-		DimmNum++;
-	}
 	Dimms = pDCTstat->MAdimms[dct];
 
 	ddr_voltage_index = dct_ddr_voltage_index(pDCTstat, dct);
@@ -76,21 +158,25 @@ static u32 mct_ControlRC(struct MCTStatStruc *pMCTstat,
 		val = 0x2;
 	else if (CtrlWordNum == 1) {
 		if (!((pDCTstat->DimmDRPresent | pDCTstat->DimmQRPresent) & (1 << DimmNum)))
-			val = 0xC; /* if single rank, set DBA1 and DBA0 */
+			val = 0xc; /* if single rank, set DBA1 and DBA0 */
 	} else if (CtrlWordNum == 2) {
-		if (package_type == PT_GR) {
-			/* Socket G34 */
-			if (MaxDimmsInstallable == 2) {
-				if (Dimms > 1)
-					val = 0x4;
+		if (is_fam15h()) {
+			val = fam15h_rdimm_rc2_control_code(pDCTstat, dct);
+		} else {
+			if (package_type == PT_GR) {
+				/* Socket G34 */
+				if (MaxDimmsInstallable == 2) {
+					if (Dimms > 1)
+						val = 0x4;
+				}
 			}
 		}
 	} else if (CtrlWordNum == 3) {
-		val = (pDCTstat->CtrlWrd3 >> (DimmNum << 2)) & 0xFF;
+		val = (pDCTstat->CtrlWrd3 >> (DimmNum << 2)) & 0xff;
 	} else if (CtrlWordNum == 4) {
-		val = (pDCTstat->CtrlWrd4 >> (DimmNum << 2)) & 0xFF;
+		val = (pDCTstat->CtrlWrd4 >> (DimmNum << 2)) & 0xff;
 	} else if (CtrlWordNum == 5) {
-		val = (pDCTstat->CtrlWrd5 >> (DimmNum << 2)) & 0xFF;
+		val = (pDCTstat->CtrlWrd5 >> (DimmNum << 2)) & 0xff;
 	} else if (CtrlWordNum == 8) {
 		if (package_type == PT_GR) {
 			/* Socket G34 */
@@ -99,7 +185,7 @@ static u32 mct_ControlRC(struct MCTStatStruc *pMCTstat,
 			}
 		}
 	} else if (CtrlWordNum == 9) {
-		val = 0xD;	/* DBA1, DBA0, DA3 = 0 */
+		val = 0xd;	/* DBA1, DBA0, DA3 = 0 */
 	} else if (CtrlWordNum == 10) {
 		val = 0x0;	/* Lowest operating frequency */
 	} else if (CtrlWordNum == 11) {
@@ -114,43 +200,30 @@ static u32 mct_ControlRC(struct MCTStatStruc *pMCTstat,
 	}
 	val &= 0xf;
 
-	printk(BIOS_SPEW, "Preparing to send DIMM RC%d: %02x\n", CtrlWordNum, val);
+	printk(BIOS_SPEW, "Preparing to send DCT %d DIMM RC%d: %02x\n", dct, CtrlWordNum, val);
 
 	val = MrsChipSel | rc_word_value_to_ctl_bits(val);
-
-	/* transfer Control word number to address [BA2,A2,A1,A0] */
-	if (CtrlWordNum > 7) {
-		val |= 1 << 18;
-		CtrlWordNum &= 7;
-	}
-	val |= CtrlWordNum;
+	val |= rc_word_address_to_ctl_bits(CtrlWordNum);
 
 	return val;
 }
 
 static void mct_SendCtrlWrd(struct MCTStatStruc *pMCTstat,
-			struct DCTStatStruc *pDCTstat, u32 val)
+			struct DCTStatStruc *pDCTstat, uint8_t dct, uint32_t val)
 {
-	uint8_t dct = 0;
 	u32 dev = pDCTstat->dev_dct;
 
-	if (pDCTstat->CSPresent_DCT[0] > 0) {
-		dct = 0;
-	} else if (pDCTstat->CSPresent_DCT[1] > 0 ){
-		dct = 1;
-	}
-
-	val |= Get_NB32_DCT(dev, dct, 0x7C) & ~0xFFFFFF;
+	val |= Get_NB32_DCT(dev, dct, 0x7c) & ~0xffffff;
 	val |= 1 << SendControlWord;
-	Set_NB32_DCT(dev, dct, 0x7C, val);
+	Set_NB32_DCT(dev, dct, 0x7c, val);
 
 	do {
-		val = Get_NB32_DCT(dev, dct, 0x7C);
+		val = Get_NB32_DCT(dev, dct, 0x7c);
 	} while (val & (1 << SendControlWord));
 }
 
 void mct_DramControlReg_Init_D(struct MCTStatStruc *pMCTstat,
-				struct DCTStatStruc *pDCTstat, u8 dct)
+				struct DCTStatStruc *pDCTstat, uint8_t dct)
 {
 	u8 MrsChipSel;
 	u32 dev = pDCTstat->dev_dct;
@@ -163,7 +236,7 @@ void mct_DramControlReg_Init_D(struct MCTStatStruc *pMCTstat,
 	for (MrsChipSel = 0; MrsChipSel < 8; MrsChipSel ++, MrsChipSel ++) {
 		if (pDCTstat->CSPresent & (1 << MrsChipSel)) {
 			val = Get_NB32_DCT(dev, dct, 0xa8);
-			val &= ~(0xF << 8);
+			val &= ~(0xf << 8);
 
 			switch (MrsChipSel) {
 				case 0:
@@ -184,8 +257,8 @@ void mct_DramControlReg_Init_D(struct MCTStatStruc *pMCTstat,
 			for (cw=0; cw <=15; cw ++) {
 				mct_Wait(1600);
 				if (!(cw==6 || cw==7)) {
-					val = mct_ControlRC(pMCTstat, pDCTstat, MrsChipSel << 20, cw);
-					mct_SendCtrlWrd(pMCTstat, pDCTstat, val);
+					val = mct_ControlRC(pMCTstat, pDCTstat, dct, MrsChipSel << rc_word_chip_select_lower_bit(), cw);
+					mct_SendCtrlWrd(pMCTstat, pDCTstat, dct, val);
 				}
 			}
 		}
@@ -195,7 +268,7 @@ void mct_DramControlReg_Init_D(struct MCTStatStruc *pMCTstat,
 }
 
 void FreqChgCtrlWrd(struct MCTStatStruc *pMCTstat,
-			struct DCTStatStruc *pDCTstat)
+			struct DCTStatStruc *pDCTstat, uint8_t dct)
 {
 	u32 SaveSpeed = pDCTstat->DIMMAutoSpeed;
 	u32 MrsChipSel;
@@ -208,10 +281,10 @@ void FreqChgCtrlWrd(struct MCTStatStruc *pMCTstat,
 	for (MrsChipSel=0; MrsChipSel < 8; MrsChipSel++, MrsChipSel++) {
 		if (pDCTstat->CSPresent & (1 << MrsChipSel)) {
 			/* 2. Program F2x[1, 0]A8[CtrlWordCS]=bit mask for target chip selects. */
-			val = Get_NB32_DCT(dev, 0, 0xA8); /* TODO: dct 0 / 1 select */
-			val &= ~(0xFF << 8);
-			val |= (0x3 << (MrsChipSel & 0xFE)) << 8;
-			Set_NB32_DCT(dev, 0, 0xA8, val); /* TODO: dct 0 / 1 select */
+			val = Get_NB32_DCT(dev, dct, 0xa8);
+			val &= ~(0xff << 8);
+			val |= (0x3 << (MrsChipSel & 0xfe)) << 8;
+			Set_NB32_DCT(dev, dct, 0xa8, val);
 
 			/* Resend control word 10 */
 			uint8_t freq_ctl_val = 0;
@@ -235,21 +308,21 @@ void FreqChgCtrlWrd(struct MCTStatStruc *pMCTstat,
 					break;
 			}
 
-			printk(BIOS_SPEW, "Preparing to send DIMM RC%d: %02x\n", 10, freq_ctl_val);
+			printk(BIOS_SPEW, "Preparing to send DCT %d DIMM RC%d: %02x\n", dct, 10, freq_ctl_val);
 
-			mct_SendCtrlWrd(pMCTstat, pDCTstat, MrsChipSel << 20 | 0x40002 | rc_word_value_to_ctl_bits(freq_ctl_val));
+			mct_SendCtrlWrd(pMCTstat, pDCTstat, dct, MrsChipSel << rc_word_chip_select_lower_bit() | rc_word_address_to_ctl_bits(10) | rc_word_value_to_ctl_bits(freq_ctl_val));
 
 			mct_Wait(1600);
 
 			/* Resend control word 2 */
-			val = mct_ControlRC(pMCTstat, pDCTstat, MrsChipSel << 20, 2);
-			mct_SendCtrlWrd(pMCTstat, pDCTstat, val);
+			val = mct_ControlRC(pMCTstat, pDCTstat, dct, MrsChipSel << rc_word_chip_select_lower_bit(), 2);
+			mct_SendCtrlWrd(pMCTstat, pDCTstat, dct, val);
 
 			mct_Wait(1600);
 
 			/* Resend control word 8 */
-			val = mct_ControlRC(pMCTstat, pDCTstat, MrsChipSel << 20, 8);
-			mct_SendCtrlWrd(pMCTstat, pDCTstat, val);
+			val = mct_ControlRC(pMCTstat, pDCTstat, dct, MrsChipSel << rc_word_chip_select_lower_bit(), 8);
+			mct_SendCtrlWrd(pMCTstat, pDCTstat, dct, val);
 
 			mct_Wait(1600);
 		}
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c b/src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c
index 9ccf77e..09a5f68 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c
@@ -445,13 +445,13 @@ static u32 mct_MR2(struct MCTStatStruc *pMCTstat,
 	u32 dev = pDCTstat->dev_dct;
 	u32 dword, ret;
 
+	/* The formula for chip select number is: CS = dimm*2+rank */
+	uint8_t dimm = MrsChipSel / 2;
+	uint8_t rank = MrsChipSel % 2;
+
 	if (is_fam15h()) {
 		uint8_t package_type = mctGet_NVbits(NV_PACK_TYPE);
 
-		/* The formula for chip select number is: CS = dimm*2+rank */
-		uint8_t dimm = MrsChipSel / 2;
-		uint8_t rank = MrsChipSel % 2;
-
 		/* FIXME: These parameters should be configurable
 		 * For now, err on the side of caution and enable automatic 2x refresh
 		 * when the DDR temperature rises above the internal limits
@@ -496,7 +496,7 @@ static u32 mct_MR2(struct MCTStatStruc *pMCTstat,
 		ret |= ((dword >> 10) & 3) << 9;
 	}
 
-	printk(BIOS_SPEW, "Going to send MR2 control word %08x\n", ret);
+	printk(BIOS_SPEW, "Going to send DCT %d DIMM %d rank %d MR2 control word %08x\n", dct, dimm, rank, ret);
 
 	return ret;
 }
@@ -507,6 +507,10 @@ static u32 mct_MR3(struct MCTStatStruc *pMCTstat,
 	u32 dev = pDCTstat->dev_dct;
 	u32 dword, ret;
 
+	/* The formula for chip select number is: CS = dimm*2+rank */
+	uint8_t dimm = MrsChipSel / 2;
+	uint8_t rank = MrsChipSel % 2;
+
 	if (is_fam15h()) {
 		ret = 0xc0000;
 		ret |= (MrsChipSel << 21);
@@ -527,7 +531,7 @@ static u32 mct_MR3(struct MCTStatStruc *pMCTstat,
 		ret |= (dword >> 24) & 7;
 	}
 
-	printk(BIOS_SPEW, "Going to send MR3 control word %08x\n", ret);
+	printk(BIOS_SPEW, "Going to send DCT %d DIMM %d rank %d MR3 control word %08x\n", dct, dimm, rank, ret);
 
 	return ret;
 }
@@ -538,6 +542,10 @@ static u32 mct_MR1(struct MCTStatStruc *pMCTstat,
 	u32 dev = pDCTstat->dev_dct;
 	u32 dword, ret;
 
+	/* The formula for chip select number is: CS = dimm*2+rank */
+	uint8_t dimm = MrsChipSel / 2;
+	uint8_t rank = MrsChipSel % 2;
+
 	if (is_fam15h()) {
 		uint8_t package_type = mctGet_NVbits(NV_PACK_TYPE);
 
@@ -553,10 +561,6 @@ static u32 mct_MR1(struct MCTStatStruc *pMCTstat,
 		ret = 0x40000;
 		ret |= (MrsChipSel << 21);
 
-		/* The formula for chip select number is: CS = dimm*2+rank */
-		uint8_t dimm = MrsChipSel / 2;
-		uint8_t rank = MrsChipSel % 2;
-
 		/* Determine if TQDS should be set */
 		if ((pDCTstat->Dimmx8Present & (1 << dimm))
 			&& (((dimm & 0x1)?(pDCTstat->Dimmx4Present&0x55):(pDCTstat->Dimmx4Present&0xaa)) != 0x0)
@@ -623,7 +627,7 @@ static u32 mct_MR1(struct MCTStatStruc *pMCTstat,
 			ret |= 1 << 12;
 	}
 
-	printk(BIOS_SPEW, "Going to send MR1 control word %08x\n", ret);
+	printk(BIOS_SPEW, "Going to send DCT %d DIMM %d rank %d MR1 control word %08x\n", dct, dimm, rank, ret);
 
 	return ret;
 }
@@ -634,6 +638,10 @@ static u32 mct_MR0(struct MCTStatStruc *pMCTstat,
 	u32 dev = pDCTstat->dev_dct;
 	u32 dword, ret, dword2;
 
+	/* The formula for chip select number is: CS = dimm*2+rank */
+	uint8_t dimm = MrsChipSel / 2;
+	uint8_t rank = MrsChipSel % 2;
+
 	if (is_fam15h()) {
 		ret = 0x00000;
 		ret |= (MrsChipSel << 21);
@@ -744,7 +752,7 @@ static u32 mct_MR0(struct MCTStatStruc *pMCTstat,
 		ret |= 1 << 8;
 	}
 
-	printk(BIOS_SPEW, "Going to send MR0 control word %08x\n", ret);
+	printk(BIOS_SPEW, "Going to send DCT %d DIMM %d rank %d MR0 control word %08x\n", dct, dimm, rank, ret);
 
 	return ret;
 }
@@ -811,6 +819,16 @@ void mct_DramInit_Sw_D(struct MCTStatStruc *pMCTstat,
 		/* 8.wait 360ns */
 		mct_Wait(80);
 
+		/* Set up address parity */
+		if ((pDCTstat->Status & (1 << SB_Registered))
+			|| (pDCTstat->Status & (1 << SB_LoadReduced))) {
+			if (is_fam15h()) {
+				dword = Get_NB32_DCT(dev, dct, 0x90);
+				dword |= 1 << ParEn;
+				Set_NB32_DCT(dev, dct, 0x90, dword);
+			}
+		}
+
 		/* The following steps are performed with registered DIMMs only and
 		 * must be done for each chip select pair */
 		if (pDCTstat->Status & (1 << SB_Registered))
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c b/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c
index 981f467..707e6a9 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c
@@ -1146,8 +1146,10 @@ static void dqsTrainRcvrEn_SW_Fam15(struct MCTStatStruc *pMCTstat,
 	uint8_t dimm;
 	uint8_t rank;
 	uint8_t lane;
+	uint8_t nibble;
 	uint8_t mem_clk;
 	uint16_t initial_seed;
+	uint8_t train_both_nibbles;
 	uint16_t current_total_delay[MAX_BYTE_LANES];
 	uint16_t dqs_ret_pass1_total_delay[MAX_BYTE_LANES];
 	uint16_t rank0_current_total_delay[MAX_BYTE_LANES];
@@ -1163,6 +1165,11 @@ static void dqsTrainRcvrEn_SW_Fam15(struct MCTStatStruc *pMCTstat,
 	print_debug_dqs("\nTrainRcvEn: Node", pDCTstat->Node_ID, 0);
 	print_debug_dqs("TrainRcvEn: Pass", Pass, 0);
 
+	train_both_nibbles = 0;
+	if (pDCTstat->Dimmx4Present)
+		if (is_fam15h())
+			train_both_nibbles = 1;
+
 	dev = pDCTstat->dev_dct;
 	index_reg = 0x98;
 	ch_start = 0;
@@ -1245,132 +1252,148 @@ static void dqsTrainRcvrEn_SW_Fam15(struct MCTStatStruc *pMCTstat,
 			else
 				_2Ranks = 0;
 			for (rank = 0; rank < (_2Ranks + 1); rank++) {
-				/* 2.10.5.8.2 (1)
-				 * Specify the target DIMM to be trained
-				 * Set TrNibbleSel = 0
-				 *
-				 * TODO: Add support for x4 DIMMs
-				 */
-				dword = Get_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008);
-				dword &= ~(0x3 << 4);		/* TrDimmSel */
-				dword |= ((dimm & 0x3) << 4);
-				dword &= ~(0x1 << 2);		/* TrNibbleSel */
-				Set_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008, dword);
-
-				/* 2.10.5.8.2 (2)
-				 * Retrieve gross and fine timing fields from write DQS registers
-				 */
-				read_dqs_write_timing_control_registers(current_total_delay, dev, Channel, dimm, index_reg);
+				for (nibble = 0; nibble < (train_both_nibbles + 1); nibble++) {
+					/* 2.10.5.8.2 (1)
+					 * Specify the target DIMM and nibble to be trained
+					 */
+					dword = Get_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008);
+					dword &= ~(0x3 << 4);		/* TrDimmSel = dimm */
+					dword |= ((dimm & 0x3) << 4);
+					dword &= ~(0x1 << 2);		/* TrNibbleSel = nibble */
+					dword |= ((nibble & 0x1) << 2);
+					Set_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008, dword);
+
+					/* 2.10.5.8.2 (2)
+					 * Retrieve gross and fine timing fields from write DQS registers
+					 */
+					read_dqs_write_timing_control_registers(current_total_delay, dev, Channel, dimm, index_reg);
 
-				/* 2.10.5.8.2.1
-				 * Generate the DQS Receiver Enable Training Seed Values
-				 */
-				if (Pass == FirstPass) {
-					initial_seed = fam15_receiver_enable_training_seed(pDCTstat, Channel, dimm, rank, package_type);
+					/* 2.10.5.8.2.1
+					 * Generate the DQS Receiver Enable Training Seed Values
+					 */
+					if (Pass == FirstPass) {
+						initial_seed = fam15_receiver_enable_training_seed(pDCTstat, Channel, dimm, rank, package_type);
 
-					/* Adjust seed for the minimum platform supported frequency */
-					initial_seed = (uint16_t) (((((uint64_t) initial_seed) *
-						fam15h_freq_tab[mem_clk] * 100) / (mctGet_NVbits(NV_MIN_MEMCLK) * 100)));
+						/* Adjust seed for the minimum platform supported frequency */
+						initial_seed = (uint16_t) (((((uint64_t) initial_seed) *
+							fam15h_freq_tab[mem_clk] * 100) / (mctGet_NVbits(NV_MIN_MEMCLK) * 100)));
 
-					for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
-						uint16_t wl_pass1_delay;
-						wl_pass1_delay = current_total_delay[lane];
+						for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
+							uint16_t wl_pass1_delay;
+							wl_pass1_delay = current_total_delay[lane];
 
-						seed[lane] = initial_seed + wl_pass1_delay;
-					}
-				} else {
-					uint8_t addr_prelaunch = 0;		/* TODO: Fetch the correct value from RC2[0] */
-					uint16_t register_delay;
-					int16_t seed_prescaling;
-
-					memcpy(current_total_delay, dqs_ret_pass1_total_delay, sizeof(current_total_delay));
-					if ((pDCTstat->Status & (1 << SB_Registered))) {
-						if (addr_prelaunch)
-							register_delay = 0x30;
-						else
-							register_delay = 0x20;
-					} else if ((pDCTstat->Status & (1 << SB_LoadReduced))) {
-						/* TODO
-						* Load reduced DIMM support unimplemented
-						*/
-						register_delay = 0x0;
+							seed[lane] = initial_seed + wl_pass1_delay;
+						}
 					} else {
-						register_delay = 0x0;
+						uint8_t addr_prelaunch = 0;		/* TODO: Fetch the correct value from RC2[0] */
+						uint16_t register_delay;
+						int16_t seed_prescaling;
+
+						memcpy(current_total_delay, dqs_ret_pass1_total_delay, sizeof(current_total_delay));
+						if ((pDCTstat->Status & (1 << SB_Registered))) {
+							if (addr_prelaunch)
+								register_delay = 0x30;
+							else
+								register_delay = 0x20;
+						} else if ((pDCTstat->Status & (1 << SB_LoadReduced))) {
+							/* TODO
+							 * Load reduced DIMM support unimplemented
+							 */
+							register_delay = 0x0;
+						} else {
+							register_delay = 0x0;
+						}
+
+						for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
+							seed_prescaling = current_total_delay[lane] - register_delay - 0x20;
+							seed[lane] = (uint16_t) (register_delay + ((((uint64_t) seed_prescaling) * fam15h_freq_tab[mem_clk] * 100) / (mctGet_NVbits(NV_MIN_MEMCLK) * 100)));
+						}
 					}
 
 					for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
-						seed_prescaling = current_total_delay[lane] - register_delay - 0x20;
-						seed[lane] = (uint16_t) (register_delay + ((((uint64_t) seed_prescaling) * fam15h_freq_tab[mem_clk] * 100) / (mctGet_NVbits(NV_MIN_MEMCLK) * 100)));
-					}
-				}
+						seed_gross[lane] = (seed[lane] >> 5) & 0x1f;
+						seed_fine[lane] = seed[lane] & 0x1f;
 
-				for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
-					seed_gross[lane] = (seed[lane] >> 5) & 0x1f;
-					seed_fine[lane] = seed[lane] & 0x1f;
+						/*if (seed_gross[lane] == 0)
+							seed_pre_gross[lane] = 0;
+						else */if (seed_gross[lane] & 0x1)
+							seed_pre_gross[lane] = 1;
+						else
+							seed_pre_gross[lane] = 2;
 
-					/*if (seed_gross[lane] == 0)
-						seed_pre_gross[lane] = 0;
-					else */if (seed_gross[lane] & 0x1)
-						seed_pre_gross[lane] = 1;
-					else
-						seed_pre_gross[lane] = 2;
+						/* Calculate phase recovery delays */
+						phase_recovery_delays[lane] = ((seed_pre_gross[lane] & 0x1f) << 5) | (seed_fine[lane] & 0x1f);
 
-					/* Calculate phase recovery delays */
-					phase_recovery_delays[lane] = ((seed_pre_gross[lane] & 0x1f) << 5) | (seed_fine[lane] & 0x1f);
+						/* Set the gross delay.
+						 * NOTE: While the BKDG states to only program DqsRcvEnGrossDelay, this appears
+						 * to have been a misprint as DqsRcvEnFineDelay should be set to zero as well.
+						 */
+						current_total_delay[lane] = ((seed_gross[lane] & 0x1f) << 5);
+					}
 
-					/* Set the gross delay.
-					 * NOTE: While the BKDG states to only program DqsRcvEnGrossDelay, this appears
-					 * to have been a misprint as DqsRcvEnFineDelay should be set to zero as well.
+					/* 2.10.5.8.2 (2) / 2.10.5.8.2.1 (5 6)
+					 * Program PhRecFineDly and PhRecGrossDly
 					 */
-					current_total_delay[lane] = ((seed_gross[lane] & 0x1f) << 5);
-				}
+					write_dram_phase_recovery_control_registers(phase_recovery_delays, dev, Channel, dimm, index_reg);
 
-				/* 2.10.5.8.2 (2) / 2.10.5.8.2.1 (5 6)
-				 * Program PhRecFineDly and PhRecGrossDly
-				 */
-				write_dram_phase_recovery_control_registers(phase_recovery_delays, dev, Channel, dimm, index_reg);
+					/* 2.10.5.8.2 (2) / 2.10.5.8.2.1 (7)
+					 * Program the DQS Receiver Enable delay values for each lane
+					 */
+					write_dqs_receiver_enable_control_registers(current_total_delay, dev, Channel, dimm, index_reg);
 
-				/* 2.10.5.8.2 (2) / 2.10.5.8.2.1 (7)
-				 * Program the DQS Receiver Enable delay values for each lane
-				 */
-				write_dqs_receiver_enable_control_registers(current_total_delay, dev, Channel, dimm, index_reg);
+					/* 2.10.5.8.2 (3)
+					 * Program DqsRcvTrEn = 1
+					 */
+					dword = Get_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008);
+					dword |= (0x1 << 13);
+					Set_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008, dword);
 
-				/* 2.10.5.8.2 (3)
-				 * Program DqsRcvTrEn = 1
-				 */
-				dword = Get_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008);
-				dword |= (0x1 << 13);
-				Set_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008, dword);
+					/* 2.10.5.8.2 (4)
+					 * Issue 192 read requests to the target rank
+					 */
+					generate_dram_receiver_enable_training_pattern_fam15(pMCTstat, pDCTstat, Channel, Receiver + (rank & 0x1));
 
-				/* 2.10.5.8.2 (4)
-				 * Issue 192 read requests to the target rank
-				 */
-				generate_dram_receiver_enable_training_pattern_fam15(pMCTstat, pDCTstat, Channel, Receiver + (rank & 0x1));
+					/* 2.10.5.8.2 (5)
+					 * Program DqsRcvTrEn = 0
+					 */
+					dword = Get_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008);
+					dword &= ~(0x1 << 13);
+					Set_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008, dword);
 
-				/* 2.10.5.8.2 (5)
-				 * Program DqsRcvTrEn = 0
-				 */
-				dword = Get_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008);
-				dword &= ~(0x1 << 13);
-				Set_NB32_index_wait_DCT(dev, Channel, index_reg, 0x00000008, dword);
+					/* 2.10.5.8.2 (6)
+					 * Read PhRecGrossDly, PhRecFineDly
+					 */
+					read_dram_phase_recovery_control_registers(phase_recovery_delays, dev, Channel, dimm, index_reg);
 
-				/* 2.10.5.8.2 (6)
-				 * Read PhRecGrossDly, PhRecFineDly
-				 */
-				read_dram_phase_recovery_control_registers(phase_recovery_delays, dev, Channel, dimm, index_reg);
+					/* 2.10.5.8.2 (7)
+					 * Calculate and program the DQS Receiver Enable delay values
+					 */
+					for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
+						current_total_delay[lane] = (phase_recovery_delays[lane] & 0x1f);
+						current_total_delay[lane] |= ((seed_gross[lane] + ((phase_recovery_delays[lane] >> 5) & 0x1f) - seed_pre_gross[lane] + 1) << 5);
+						if (nibble == 0) {
+							if (lane == 8)
+								pDCTstat->CH_D_BC_RCVRDLY[Channel][dimm] = current_total_delay[lane];
+							else
+								pDCTstat->CH_D_B_RCVRDLY[Channel][dimm][lane] = current_total_delay[lane];
+						} else {
+							/* 2.10.5.8.2 (1)
+							 * Average the trained values of both nibbles on x4 DIMMs
+							 */
+							if (lane == 8)
+								pDCTstat->CH_D_BC_RCVRDLY[Channel][dimm] = (pDCTstat->CH_D_BC_RCVRDLY[Channel][dimm] + current_total_delay[lane]) / 2;
+							else
+								pDCTstat->CH_D_B_RCVRDLY[Channel][dimm][lane] = (pDCTstat->CH_D_B_RCVRDLY[Channel][dimm][lane] + current_total_delay[lane]) / 2;
+						}
+					}
 
-				/* 2.10.5.8.2 (7)
-				 * Calculate and program the DQS Receiver Enable delay values
-				 */
-				for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
-					current_total_delay[lane] = (phase_recovery_delays[lane] & 0x1f);
-					current_total_delay[lane] |= ((seed_gross[lane] + ((phase_recovery_delays[lane] >> 5) & 0x1f) - seed_pre_gross[lane] + 1) << 5);
-					if (lane == 8)
-						pDCTstat->CH_D_BC_RCVRDLY[Channel][dimm] = current_total_delay[lane];
-					else
-						pDCTstat->CH_D_B_RCVRDLY[Channel][dimm][lane] = current_total_delay[lane];
+#if DQS_TRAIN_DEBUG > 1
+					for (lane = 0; lane < 8; lane++)
+						printk(BIOS_DEBUG, "\t\tTrainRcvEn55: Channel: %d dimm: %d nibble: %d lane %d current_total_delay: %04x CH_D_B_RCVRDLY: %04x\n",
+							Channel, dimm, nibble, lane, current_total_delay[lane], pDCTstat->CH_D_B_RCVRDLY[Channel][dimm][lane]);
+#endif
+					write_dqs_receiver_enable_control_registers(current_total_delay, dev, Channel, dimm, index_reg);
 				}
-				write_dqs_receiver_enable_control_registers(current_total_delay, dev, Channel, dimm, index_reg);
 
 				if (rank == 0) {
 					/* Back up the Rank 0 delays for later use */
@@ -1395,7 +1418,7 @@ static void dqsTrainRcvrEn_SW_Fam15(struct MCTStatStruc *pMCTstat,
 
 #if DQS_TRAIN_DEBUG > 0
 			for (lane = 0; lane < 8; lane++)
-				print_debug_dqs_pair("\t\tTrainRcvEn55: Lane ", lane, " current_total_delay ", current_total_delay[lane], 2);
+				print_debug_dqs_pair("\t\tTrainRcvEn56: Lane ", lane, " current_total_delay ", current_total_delay[lane], 2);
 #endif
 		}
 	}
@@ -1815,15 +1838,23 @@ void mctSetEccDQSRcvrEn_D(struct MCTStatStruc *pMCTstat,
 }
 
 void phyAssistedMemFnceTraining(struct MCTStatStruc *pMCTstat,
-			struct DCTStatStruc *pDCTstatA)
+			struct DCTStatStruc *pDCTstatA, int16_t single_node_number)
 {
 	u8 Node = 0;
 	struct DCTStatStruc *pDCTstat;
 
 	printk(BIOS_DEBUG, "%s: Start\n", __func__);
 
+	uint8_t start_node = 0;
+	uint8_t end_node = MAX_NODES_SUPPORTED;	/* exclusive bound; default covers every node */
+
+	if (single_node_number >= 0) {
+		start_node = single_node_number;
+		end_node = single_node_number + 1;	/* exclusive bound: Node < end_node must still visit the requested node */
+	}
+
 	/* FIXME: skip for Ax */
-	for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) {
+	for (Node = start_node; Node < end_node; Node++) {
 		pDCTstat = pDCTstatA + Node;
 		if (!pDCTstat->NodePresent)
 			continue;
@@ -1847,6 +1878,8 @@ void phyAssistedMemFnceTraining(struct MCTStatStruc *pMCTstat,
 					if (!pDCTstat->DIMMValidDCT[dct])
 						continue;
 
+					printk(BIOS_SPEW, "%s: training node %d DCT %d\n", __func__, Node, dct);
+
 					/* Back up D18F2x9C_x0000_0004_dct[1:0] */
 					datc_backup = Get_NB32_index_wait_DCT(dev, dct, index_reg, 0x00000004);
 
@@ -1985,6 +2018,8 @@ void phyAssistedMemFnceTraining(struct MCTStatStruc *pMCTstat,
 
 					/* Restore D18F2x9C_x0000_0004_dct[1:0] */
 					Set_NB32_index_wait_DCT(dev, dct, index_reg, 0x00000004, datc_backup);
+
+					printk(BIOS_SPEW, "%s: done training node %d DCT %d\n", __func__, Node, dct);
 				}
 			} else {
 				fenceDynTraining_D(pMCTstat, pDCTstat, 0);
@@ -1997,7 +2032,7 @@ void phyAssistedMemFnceTraining(struct MCTStatStruc *pMCTstat,
 }
 
 static uint32_t fenceDynTraining_D(struct MCTStatStruc *pMCTstat,
-			struct DCTStatStruc *pDCTstat, u8 dct)
+			struct DCTStatStruc *pDCTstat, uint8_t dct)
 {
 	u16 avRecValue;
 	u32 val;
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c b/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c
index 6b63ba0..3153e46 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c
@@ -19,7 +19,7 @@
  */
 
 static void FreqChgCtrlWrd(struct MCTStatStruc *pMCTstat,
-			struct DCTStatStruc *pDCTstat);
+			struct DCTStatStruc *pDCTstat, uint8_t dct);
 
 
 static void AgesaDelay(u32 msec)
@@ -353,11 +353,14 @@ static void ExitSelfRefresh(struct MCTStatStruc *pMCTstat,
 }
 
 void SetTargetFreq(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat)
+					struct DCTStatStruc *pDCTstatA, uint8_t Node)
 {
 	uint32_t dword;
 	uint8_t package_type = mctGet_NVbits(NV_PACK_TYPE);
 
+	struct DCTStatStruc *pDCTstat;
+	pDCTstat = pDCTstatA + Node;
+
 	if (is_fam15h()) {
 		/* Program F2x[1, 0]90[DisDllShutDownSR]=1. */
 		if (pDCTstat->DIMMValidDCT[0]) {
@@ -391,7 +394,7 @@ void SetTargetFreq(struct MCTStatStruc *pMCTstat,
 		uint8_t dct;
 		for (dct = 0; dct < 2; dct++) {
 			if (pDCTstat->DIMMValidDCT[dct]) {
-				phyAssistedMemFnceTraining(pMCTstat, pDCTstat);
+				phyAssistedMemFnceTraining(pMCTstat, pDCTstatA, Node);
 				InitPhyCompensation(pMCTstat, pDCTstat, dct);
 			}
 		}
@@ -438,7 +441,12 @@ void SetTargetFreq(struct MCTStatStruc *pMCTstat,
 		else
 			pDCTstat->CSPresent = pDCTstat->CSPresent_DCT[1];
 
-		FreqChgCtrlWrd(pMCTstat, pDCTstat);
+		if (pDCTstat->DIMMValidDCT[0]) {
+			FreqChgCtrlWrd(pMCTstat, pDCTstat, 0);
+		}
+		if (pDCTstat->DIMMValidDCT[1]) {
+			FreqChgCtrlWrd(pMCTstat, pDCTstat, 1);
+		}
 	}
 }
 
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c b/src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c
index e5e4031..73b231e 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c
@@ -35,9 +35,9 @@ u32 swapBankBits(struct DCTStatStruc *pDCTstat, uint8_t dct, uint32_t MRSValue);
 void prepareDimms(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat,
 	u8 dct, u8 dimm, BOOL wl);
 void programODT(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, uint8_t dct, u8 dimm);
-void procConfig(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, uint8_t dct, u8 dimm, u8 pass);
+void procConfig(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, uint8_t dct, uint8_t dimm, uint8_t pass, uint8_t nibble);
 void setWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8 dimm, u8 targetAddr, uint8_t pass);
-void getWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8 dimm, uint8_t pass);
+void getWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8 dimm, uint8_t pass, uint8_t nibble);
 
 static int32_t abs(int32_t val) {
 	if (val < 0)
@@ -76,6 +76,8 @@ uint8_t AgesaHwWlPhase1(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCT
 {
 	u8 ByteLane;
 	u32 Value, Addr;
+	uint8_t nibble = 0;
+	uint8_t train_both_nibbles;
 	u16 Addl_Data_Offset, Addl_Data_Port;
 	sMCTStruct *pMCTData = pDCTstat->C_MCTPtr;
 	sDCTStruct *pDCTData = pDCTstat->C_DCTPtr[dct];
@@ -88,98 +90,108 @@ uint8_t AgesaHwWlPhase1(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCT
 			DRAM_ADD_DCT_PHY_CONTROL_REG, TrDimmSelStart,
 			TrDimmSelEnd, (u32)dimm);
 
-	if (is_fam15h()) {
-		/* Set TrNibbleSel = 0
-		 *
-		 * TODO: Add support for x4 DIMMs
-		 */
-		set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT,
-				DRAM_ADD_DCT_PHY_CONTROL_REG, 2,
-				2, (u32)0);
-	}
+	train_both_nibbles = 0;
+	if (pDCTstat->Dimmx4Present)
+		if (is_fam15h())
+			train_both_nibbles = 1;
 
-	/* 2. Prepare the DIMMs for write levelization using DDR3-defined
-	 * MR commands. */
-	prepareDimms(pMCTstat, pDCTstat, dct, dimm, TRUE);
+	for (nibble = 0; nibble < (train_both_nibbles + 1); nibble++) {
+		printk(BIOS_SPEW, "AgesaHwWlPhase1: training nibble %d\n", nibble);
 
-	/* 3. After the DIMMs are configured, BIOS waits 40 MEMCLKs to
-	 *    satisfy DDR3-defined internal DRAM timing.
-	 */
-	if (is_fam15h())
-		precise_memclk_delay_fam15(pMCTstat, pDCTstat, dct, 40);
-	else
-		pMCTData->AgesaDelay(40);
+		if (is_fam15h()) {
+			/* Program F2x[1, 0]9C_x08[WrtLvTrEn]=0 */
+			set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT,
+					DRAM_ADD_DCT_PHY_CONTROL_REG, WrtLvTrEn, WrtLvTrEn, 0);
+
+			/* Set TrNibbleSel */
+			set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT,
+					DRAM_ADD_DCT_PHY_CONTROL_REG, 2,
+					2, (uint32_t)nibble);
+		}
 
-	/* 4. Configure the processor's DDR phy for write levelization training: */
-	procConfig(pMCTstat, pDCTstat, dct, dimm, pass);
+		/* 2. Prepare the DIMMs for write levelization using DDR3-defined
+		 * MR commands. */
+		prepareDimms(pMCTstat, pDCTstat, dct, dimm, TRUE);
 
-	/* 5. Begin write levelization training:
-	 *  Program F2x[1, 0]9C_x08[WrtLvTrEn]=1. */
-	if (pDCTData->LogicalCPUID & (AMD_DR_Cx | AMD_DR_Dx | AMD_FAM15_ALL))
-	{
-		set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT,
-				DRAM_ADD_DCT_PHY_CONTROL_REG, WrtLvTrEn, WrtLvTrEn, 1);
-	}
-	else
-	{
-		/* Broadcast write to all D3Dbyte chipset register offset 0xc
-		 * Set bit 0 (wrTrain)
-		 * Program bit 4 to nibble being trained (only matters for x4dimms)
-		 * retain value of 3:2 (Trdimmsel)
-		 * reset bit 5 (FrzPR)
+		/* 3. After the DIMMs are configured, BIOS waits 40 MEMCLKs to
+		 *    satisfy DDR3-defined internal DRAM timing.
 		 */
-		if (dct)
+		if (is_fam15h())
+			precise_memclk_delay_fam15(pMCTstat, pDCTstat, dct, 40);
+		else
+			pMCTData->AgesaDelay(40);
+
+		/* 4. Configure the processor's DDR phy for write levelization training: */
+		procConfig(pMCTstat, pDCTstat, dct, dimm, pass, nibble);
+
+		/* 5. Begin write levelization training:
+		 *  Program F2x[1, 0]9C_x08[WrtLvTrEn]=1. */
+		if (pDCTData->LogicalCPUID & (AMD_DR_Cx | AMD_DR_Dx | AMD_FAM15_ALL))
 		{
-			Addl_Data_Offset=0x198;
-			Addl_Data_Port=0x19C;
+			set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT,
+					DRAM_ADD_DCT_PHY_CONTROL_REG, WrtLvTrEn, WrtLvTrEn, 1);
 		}
 		else
 		{
-			Addl_Data_Offset=0x98;
-			Addl_Data_Port=0x9C;
+			/* Broadcast write to all D3Dbyte chipset register offset 0xc
+			 * Set bit 0 (wrTrain)
+			 * Program bit 4 to nibble being trained (only matters for x4dimms)
+			 * retain value of 3:2 (Trdimmsel)
+			 * reset bit 5 (FrzPR)
+			 */
+			if (dct)
+			{
+				Addl_Data_Offset=0x198;
+				Addl_Data_Port=0x19C;
+			}
+			else
+			{
+				Addl_Data_Offset=0x98;
+				Addl_Data_Port=0x9C;
+			}
+			Addr=0x0D00000C;
+			AmdMemPCIWriteBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Offset), 31, 0, &Addr);
+			while ((get_Bits(pDCTData,FUN_DCT,pDCTData->NodeId, FUN_DCT, Addl_Data_Offset,
+					DctAccessDone, DctAccessDone)) == 0);
+			AmdMemPCIReadBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Port), 31, 0, &Value);
+			Value = bitTestSet(Value, 0);	/* enable WL training */
+			Value = bitTestReset(Value, 4); /* for x8 only */
+			Value = bitTestReset(Value, 5); /* for hardware WL training */
+			AmdMemPCIWriteBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Port), 31, 0, &Value);
+			Addr=0x4D030F0C;
+			AmdMemPCIWriteBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Offset), 31, 0, &Addr);
+			while ((get_Bits(pDCTData,FUN_DCT,pDCTData->NodeId, FUN_DCT, Addl_Data_Offset,
+					DctAccessDone, DctAccessDone)) == 0);
 		}
-		Addr=0x0D00000C;
-		AmdMemPCIWriteBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Offset), 31, 0, &Addr);
-		while ((get_Bits(pDCTData,FUN_DCT,pDCTData->NodeId, FUN_DCT, Addl_Data_Offset,
-				DctAccessDone, DctAccessDone)) == 0);
-		AmdMemPCIReadBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Port), 31, 0, &Value);
-		Value = bitTestSet(Value, 0);	/* enable WL training */
-		Value = bitTestReset(Value, 4); /* for x8 only */
-		Value = bitTestReset(Value, 5); /* for hardware WL training */
-		AmdMemPCIWriteBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Port), 31, 0, &Value);
-		Addr=0x4D030F0C;
-		AmdMemPCIWriteBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId),FUN_DCT,Addl_Data_Offset), 31, 0, &Addr);
-		while ((get_Bits(pDCTData,FUN_DCT,pDCTData->NodeId, FUN_DCT, Addl_Data_Offset,
-				DctAccessDone, DctAccessDone)) == 0);
-	}
 
-	if (is_fam15h())
-		proc_MFENCE();
+		if (is_fam15h())
+			proc_MFENCE();
 
-	/* Wait 200 MEMCLKs. If executing pass 2, wait 32 MEMCLKs. */
-	if (is_fam15h())
-		precise_memclk_delay_fam15(pMCTstat, pDCTstat, dct, 200);
-	else
-		pMCTData->AgesaDelay(140);
+		/* Wait 200 MEMCLKs. If executing pass 2, wait 32 MEMCLKs. */
+		if (is_fam15h())
+			precise_memclk_delay_fam15(pMCTstat, pDCTstat, dct, 200);
+		else
+			pMCTData->AgesaDelay(140);
 
-	/* Program F2x[1, 0]9C_x08[WrtLevelTrEn]=0. */
-	set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT,
-			DRAM_ADD_DCT_PHY_CONTROL_REG, WrtLvTrEn, WrtLvTrEn, 0);
+		/* Program F2x[1, 0]9C_x08[WrtLevelTrEn]=0. */
+		set_DCT_ADDR_Bits(pDCTData, dct, pDCTData->NodeId, FUN_DCT,
+				DRAM_ADD_DCT_PHY_CONTROL_REG, WrtLvTrEn, WrtLvTrEn, 0);
 
-	/* Read from registers F2x[1, 0]9C_x[51:50] and F2x[1, 0]9C_x52
-	 * to get the gross and fine delay settings
-	 * for the target DIMM and save these values. */
-	for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
-		getWLByteDelay(pDCTstat, dct, ByteLane, dimm, pass);
-	}
+		/* Read from registers F2x[1, 0]9C_x[51:50] and F2x[1, 0]9C_x52
+		 * to get the gross and fine delay settings
+		 * for the target DIMM and save these values. */
+		for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
+			getWLByteDelay(pDCTstat, dct, ByteLane, dimm, pass, nibble);
+		}
 
-	pDCTData->WLCriticalGrossDelayPrevPass = 0x1f;
+		pDCTData->WLCriticalGrossDelayPrevPass = 0x0;
+	}
 
 	return 0;
 }
 
 uint8_t AgesaHwWlPhase2(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat,
-		u8 dct, u8 dimm, u8 pass)
+		uint8_t dct, uint8_t dimm, uint8_t pass)
 {
 	u8 ByteLane;
 	uint8_t status = 0;
@@ -190,6 +202,12 @@ uint8_t AgesaHwWlPhase2(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCT
 		int32_t cgd = pDCTData->WLCriticalGrossDelayPrevPass;
 		uint8_t index = (uint8_t)(MAX_BYTE_LANES * dimm);
 
+		printk(BIOS_SPEW, "\toriginal critical gross delay: %d\n", cgd);
+
+		/* FIXME
+		 * For now, disable CGD adjustment as it seems to interfere with registered DIMM training
+		 */
+
 		/* Calculate the Critical Gross Delay */
 		for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
 			/* Calculate the gross delay differential for this lane */
@@ -205,6 +223,8 @@ uint8_t AgesaHwWlPhase2(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCT
 				cgd = gross_diff[ByteLane];
 		}
 
+		printk(BIOS_SPEW, "\tnew critical gross delay: %d\n", cgd);
+
 		pDCTData->WLCriticalGrossDelayPrevPass = cgd;
 
 		if (pDCTstat->Speed != pDCTstat->TargetFreq) {
@@ -281,7 +301,7 @@ uint8_t AgesaHwWlPhase3(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCT
 				gross_diff[ByteLane] = pDCTData->WLSeedGrossDelay[index+ByteLane] + pDCTData->WLGrossDelay[index+ByteLane];
 				gross_diff[ByteLane] -= pDCTData->WLSeedPreGrossDelay[index+ByteLane];
 
-				/* Prevent underflow in the presence of noise / instability*/
+				/* Prevent underflow in the presence of noise / instability */
 				if (gross_diff[ByteLane] < cgd)
 					gross_diff[ByteLane] = cgd;
 
@@ -289,7 +309,8 @@ uint8_t AgesaHwWlPhase3(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCT
 			}
 		} else {
 			dword = Get_NB32_DCT(pDCTstat->dev_dct, dct, 0xa8);
-			dword &= ~(0x3 << 24);			/* WrDqDqsEarly = 0 */
+			dword &= ~(0x3 << 24);			/* WrDqDqsEarly = pDCTData->WrDqsGrossDlyBaseOffset */
+			dword |= ((pDCTData->WrDqsGrossDlyBaseOffset & 0x3) << 24);
 			Set_NB32_DCT(pDCTstat->dev_dct, dct, 0xa8, dword);
 		}
 	}
@@ -959,7 +980,7 @@ static uint16_t fam15h_next_lowest_memclk_freq(uint16_t memclk_freq)
 #endif
 
 /*-----------------------------------------------------------------------------
- * void procConfig(MCTStruct *MCTData,DCTStruct *DCTData, u8 Dimm, u8 Pass)
+ * void procConfig(MCTStruct *MCTData,DCTStruct *DCTData, u8 Dimm, u8 Pass, u8 Nibble)
  *
  *  Description:
  *       This function programs the ODT values for the NB
@@ -972,13 +993,14 @@ static uint16_t fam15h_next_lowest_memclk_freq(uint16_t memclk_freq)
  *       OUT
  * ----------------------------------------------------------------------------
  */
-void procConfig(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, uint8_t dct, u8 dimm, u8 pass)
+void procConfig(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, uint8_t dct, uint8_t dimm, uint8_t pass, uint8_t nibble)
 {
 	u8 ByteLane, MemClkFreq;
 	int32_t Seed_Gross;
 	int32_t Seed_Fine;
 	uint8_t Seed_PreGross;
 	u32 Value, Addr;
+	uint32_t dword;
 	u16 Addl_Data_Offset, Addl_Data_Port;
 	sMCTStruct *pMCTData = pDCTstat->C_MCTPtr;
 	sDCTStruct *pDCTData = pDCTstat->C_DCTPtr[dct];
@@ -1048,10 +1070,17 @@ void procConfig(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, ui
 			uint8_t AddrCmdPrelaunch = 0;		/* TODO: Fetch the correct value from RC2[0] */
 			uint8_t package_type = mctGet_NVbits(NV_PACK_TYPE);
 			uint16_t Seed_Total = 0;
+			pDCTData->WrDqsGrossDlyBaseOffset = 0x0;
 			if (package_type == PT_GR) {
 				/* Socket G34: Fam15h BKDG v3.14 Table 96 */
 				if (pDCTData->Status[DCT_STATUS_REGISTERED]) {
+					/* TODO
+					 * Implement mainboard-specific seed and
+					 * WrDqsGrossDly base overrides.
+					 * 0x41 and 0x0 are the "stock" values
+					 */
 					Seed_Total = 0x41;
+					pDCTData->WrDqsGrossDlyBaseOffset = 0x2;
 				} else if (pDCTData->Status[DCT_STATUS_LOAD_REDUCED]) {
 					Seed_Total = 0x0;
 				} else {
@@ -1133,15 +1162,16 @@ void procConfig(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, ui
 			printk(BIOS_SPEW, "\tLane %02x initial seed: %04x\n", ByteLane, ((Seed_Gross & 0x1f) << 5) | (Seed_Fine & 0x1f));
 		}
 	} else {
-		/* Pass 2 */
-		/* From BKDG, Write Leveling Seed Value. */
-		if (is_fam15h()) {
-			uint32_t RegisterDelay;
-			int32_t SeedTotal;
-			int32_t SeedTotalPreScaling;
-			uint8_t AddrCmdPrelaunch = 0;		/* TODO: Fetch the correct value from RC2[0] */
+		if (nibble == 0) {
+			/* Pass 2 */
+			/* From BKDG, Write Leveling Seed Value. */
+			if (is_fam15h()) {
+				uint32_t RegisterDelay;
+				int32_t SeedTotal[MAX_BYTE_LANES];
+				int32_t SeedTotalPreScaling[MAX_BYTE_LANES];
+				uint32_t WrDqDqsEarly;
+				uint8_t AddrCmdPrelaunch = 0;		/* TODO: Fetch the correct value from RC2[0] */
 
-			for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
 				if (pDCTData->Status[DCT_STATUS_REGISTERED]) {
 					if (AddrCmdPrelaunch)
 						RegisterDelay = 0x30;
@@ -1150,84 +1180,133 @@ void procConfig(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, ui
 				} else {
 					RegisterDelay = 0;
 				}
+
 				/* Retrieve WrDqDqsEarly */
-				AmdMemPCIReadBits(MAKE_SBDFO(0,0,24+(pDCTData->NodeId), FUN_DCT, 0xa8), 25, 24, &Value);
+				dword = Get_NB32_DCT(pDCTstat->dev_dct, dct, 0xa8);
+				WrDqDqsEarly = (dword >> 24) & 0x3;
 
-				/* Calculate adjusted seed values */
-				SeedTotal = (pDCTData->WLFineDelayPrevPass[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) |
-					((pDCTData->WLGrossDelayPrevPass[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) << 5);
-				SeedTotalPreScaling = (SeedTotal - RegisterDelay - (0x20 * Value));
-				SeedTotal = (int32_t) (RegisterDelay + ((((int64_t) SeedTotalPreScaling) *
-					fam15h_freq_tab[MemClkFreq] * 100) / (fam15h_freq_tab[pDCTData->WLPrevMemclkFreq] * 100)));
+				/* FIXME
+				 * Ignore WrDqDqsEarly for now to work around training issues
+				 */
+				WrDqDqsEarly = 0;
 
-				if (SeedTotal >= 0) {
-					Seed_Gross = SeedTotal / 32;
-					Seed_Fine = SeedTotal % 32;
-				} else {
-					Seed_Gross = (SeedTotal / 32) - 1;
-					Seed_Fine = (SeedTotal % 32) + 32;
+				/* Generate new seed values */
+				for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
+					/* Calculate adjusted seed values */
+					SeedTotal[ByteLane] = (pDCTData->WLFineDelayPrevPass[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) |
+						((pDCTData->WLGrossDelayPrevPass[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) << 5);
+					SeedTotalPreScaling[ByteLane] = (SeedTotal[ByteLane] - RegisterDelay - (0x20 * WrDqDqsEarly));
+					SeedTotal[ByteLane] = (int32_t) (RegisterDelay + ((((int64_t) SeedTotalPreScaling[ByteLane]) *
+						fam15h_freq_tab[MemClkFreq] * 100) / (fam15h_freq_tab[pDCTData->WLPrevMemclkFreq] * 100)));
 				}
 
-				if (Seed_Gross == 0)
-					Seed_PreGross = 0;
-				else if (Seed_Gross & 0x1)
-					Seed_PreGross = 1;
-				else
-					Seed_PreGross = 2;
+				/* Generate register values from seeds */
+				for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
+					printk(BIOS_SPEW, "\tLane %02x scaled delay: %04x\n", ByteLane, SeedTotal[ByteLane]);
 
-				/* Save seed values for later use */
-				pDCTData->WLSeedGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Gross;
-				pDCTData->WLSeedFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Fine;
-				pDCTData->WLSeedPreGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_PreGross;
+					if (SeedTotal[ByteLane] >= 0) {
+						Seed_Gross = SeedTotal[ByteLane] / 32;
+						Seed_Fine = SeedTotal[ByteLane] % 32;
+					} else {
+						Seed_Gross = (SeedTotal[ByteLane] / 32) - 1;
+						Seed_Fine = (SeedTotal[ByteLane] % 32) + 32;
+					}
 
-				pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_PreGross;
-				pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Fine;
+					if (Seed_Gross == 0)
+						Seed_PreGross = 0;
+					else if (Seed_Gross & 0x1)
+						Seed_PreGross = 1;
+					else
+						Seed_PreGross = 2;
 
-				printk(BIOS_SPEW, "\tLane %02x new seed: %04x\n", ByteLane, ((Seed_Gross & 0x1f) << 5) | (Seed_Fine & 0x1f));
-			}
-		} else {
-			uint32_t RegisterDelay;
-			uint32_t SeedTotalPreScaling;
-			uint32_t SeedTotal;
-			uint8_t AddrCmdPrelaunch = 0;		/* TODO: Fetch the correct value from RC2[0] */
-			for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++)
-			{
-				if (pDCTData->Status[DCT_STATUS_REGISTERED]) {
-					if (AddrCmdPrelaunch == 0)
-						RegisterDelay = 0x20;
+					/* The BKDG-recommended algorithm causes problems with registered DIMMs on some systems
+					 * due to the long register delays causing premature total delay wrap-around.
+					 * Attempt to work around this...
+					 */
+					Seed_PreGross = Seed_Gross;
+
+					/* Save seed values for later use */
+					pDCTData->WLSeedGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Gross;
+					pDCTData->WLSeedFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Fine;
+					pDCTData->WLSeedPreGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_PreGross;
+
+					pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_PreGross;
+					pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Fine;
+
+					printk(BIOS_SPEW, "\tLane %02x new seed: %04x\n", ByteLane, ((pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) << 5) | (pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f));
+				}
+			} else {
+				uint32_t RegisterDelay;
+				uint32_t SeedTotalPreScaling;
+				uint32_t SeedTotal;
+				uint8_t AddrCmdPrelaunch = 0;		/* TODO: Fetch the correct value from RC2[0] */
+				for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++)
+				{
+					if (pDCTData->Status[DCT_STATUS_REGISTERED]) {
+						if (AddrCmdPrelaunch == 0)
+							RegisterDelay = 0x20;
+						else
+							RegisterDelay = 0x30;
+					} else {
+						RegisterDelay = 0;
+					}
+					SeedTotalPreScaling = ((pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) |
+						(pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] << 5)) - RegisterDelay;
+					/* SeedTotalPreScaling = (the total delay value in F2x[1, 0]9C_x[4A:30] from pass 1 of write levelization
+					training) - RegisterDelay. */
+					SeedTotal = (uint16_t) ((((uint64_t) SeedTotalPreScaling) *
+										fam10h_freq_tab[MemClkFreq] * 100) / (fam10h_freq_tab[3] * 100));
+					Seed_Gross = SeedTotal / 32;
+					Seed_Fine = SeedTotal & 0x1f;
+					if (Seed_Gross == 0)
+						Seed_Gross = 0;
+					else if (Seed_Gross & 0x1)
+						Seed_Gross = 1;
 					else
-						RegisterDelay = 0x30;
-				} else {
-					RegisterDelay = 0;
+						Seed_Gross = 2;
+
+					/* The BKDG-recommended algorithm causes problems with registered DIMMs on some systems
+					 * due to the long register delays causing premature total delay wrap-around.
+					 * Attempt to work around this...
+					 */
+					SeedTotal = ((Seed_Gross & 0x1f) << 5) | (Seed_Fine & 0x1f);
+					SeedTotal += RegisterDelay;
+					Seed_Gross = SeedTotal / 32;
+					Seed_Fine = SeedTotal & 0x1f;
+
+					pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Gross;
+					pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Fine;
+
+					printk(BIOS_SPEW, "\tLane %02x new seed: %04x\n", ByteLane, ((pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) << 5) | (pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f));
 				}
-				SeedTotalPreScaling = ((pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) |
-					(pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] << 5)) - RegisterDelay;
-				/* SeedTotalPreScaling = (the total delay value in F2x[1, 0]9C_x[4A:30] from pass 1 of write levelization
-				training) - RegisterDelay. */
-				SeedTotal = (uint16_t) ((((uint64_t) SeedTotalPreScaling) *
-									fam10h_freq_tab[MemClkFreq] * 100) / (fam10h_freq_tab[3] * 100));
-				Seed_Gross = SeedTotal / 32;
-				Seed_Fine = SeedTotal & 0x1f;
-				if (Seed_Gross == 0)
-					Seed_Gross = 0;
-				else if (Seed_Gross & 0x1)
-					Seed_Gross = 1;
-				else
-					Seed_Gross = 2;
+			}
 
-				/* The BKDG-recommended algorithm causes problems with registered DIMMs on some systems
-				 * due to the long register delays causing premature total delay wrap-around.
-				 * Attempt to work around this...
-				 */
-				SeedTotal = ((Seed_Gross & 0x1f) << 5) | (Seed_Fine & 0x1f);
-				SeedTotal += RegisterDelay;
-				Seed_Gross = SeedTotal / 32;
-				Seed_Fine = SeedTotal & 0x1f;
+			/* Save initial seeds for upper nibble pass */
+			for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
+				pDCTData->WLSeedPreGrossPrevNibble[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedPreGrossDelay[MAX_BYTE_LANES*dimm+ByteLane];
+				pDCTData->WLSeedGrossPrevNibble[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane];
+				pDCTData->WLSeedFinePrevNibble[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane];
+			}
+		} else {
+			/* Restore seed values from lower nibble pass */
+			if (is_fam15h()) {
+				for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
+					pDCTData->WLSeedGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedGrossPrevNibble[MAX_BYTE_LANES*dimm+ByteLane];
+					pDCTData->WLSeedFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedFinePrevNibble[MAX_BYTE_LANES*dimm+ByteLane];
+					pDCTData->WLSeedPreGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedPreGrossPrevNibble[MAX_BYTE_LANES*dimm+ByteLane];
 
-				pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Gross;
-				pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Fine;
+					pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedPreGrossPrevNibble[MAX_BYTE_LANES*dimm+ByteLane];
+					pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedFinePrevNibble[MAX_BYTE_LANES*dimm+ByteLane];
 
-				printk(BIOS_SPEW, "\tLane %02x new seed: %04x\n", ByteLane, ((Seed_Gross & 0x1f) << 5) | (Seed_Fine & 0x1f));
+					printk(BIOS_SPEW, "\tLane %02x new seed: %04x\n", ByteLane, ((pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) << 5) | (pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f));
+				}
+			} else {
+				for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++) {
+					pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedGrossPrevNibble[MAX_BYTE_LANES*dimm+ByteLane];
+					pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = pDCTData->WLSeedFinePrevNibble[MAX_BYTE_LANES*dimm+ByteLane];
+
+					printk(BIOS_SPEW, "\tLane %02x new seed: %04x\n", ByteLane, ((pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) << 5) | (pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f));
+				}
 			}
 		}
 	}
@@ -1358,7 +1437,7 @@ void setWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8
 }
 
 /*-----------------------------------------------------------------------------
- *  void getWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8 Dimm)
+ *  void getWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8 dimm, uint8_t pass, uint8_t nibble)
  *
  *  Description:
  *       This function reads the write levelization byte delay from the Phase
@@ -1376,7 +1455,7 @@ void setWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8
  *
  *-----------------------------------------------------------------------------
  */
-void getWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8 dimm, uint8_t pass)
+void getWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8 dimm, uint8_t pass, uint8_t nibble)
 {
 	sDCTStruct *pDCTData = pDCTstat->C_DCTPtr[dct];
 	u8 fineStartLoc, fineEndLoc, grossStartLoc, grossEndLoc, tempB, tempB1, index;
@@ -1427,7 +1506,16 @@ void getWLByteDelay(struct DCTStatStruc *pDCTstat, uint8_t dct, u8 ByteLane, u8
 			fine = 0;
 		}
 	}
-	pDCTData->WLFineDelay[index+ByteLane] = (u8)fine;
-	pDCTData->WLGrossDelay[index+ByteLane] = (u8)gross;
-	printk(BIOS_SPEW, "\tLane %02x final adjusted value: %04x\n", ByteLane, ((gross & 0x1f) << 5) | (fine & 0x1f));
+	if (nibble == 0) {
+		pDCTData->WLFineDelay[index+ByteLane] = (uint8_t)fine;
+		pDCTData->WLGrossDelay[index+ByteLane] = (uint8_t)gross;
+	} else {
+		uint32_t WLTotalDelay = ((pDCTData->WLGrossDelay[index+ByteLane] & 0x1f) << 5) | (pDCTData->WLFineDelay[index+ByteLane] & 0x1f);
+		WLTotalDelay += ((gross & 0x1f) << 5) | (fine & 0x1f);
+		WLTotalDelay /= 2;
+		pDCTData->WLFineDelay[index+ByteLane] = (uint8_t)(WLTotalDelay & 0x1f);
+		pDCTData->WLGrossDelay[index+ByteLane] = (uint8_t)((WLTotalDelay >> 5) & 0x1f);
+	}
+
+	printk(BIOS_SPEW, "\tLane %02x adjusted value: %04x\n", ByteLane, ((pDCTData->WLGrossDelay[index+ByteLane] & 0x1f) << 5) | (pDCTData->WLFineDelay[index+ByteLane] & 0x1f));
 }
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mwlc_d.h b/src/northbridge/amd/amdmct/mct_ddr3/mwlc_d.h
index 12e7c4a..3337c14 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mwlc_d.h
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mwlc_d.h
@@ -119,16 +119,21 @@ typedef struct _sDCTStruct
 	u8 DctTrain;			/* Current DCT being trained */
 	u8 CurrDct;			/* Current DCT number (0 or 1) */
 	u8 DctCSPresent;		/* Current DCT CS mapping */
+	uint8_t WrDqsGrossDlyBaseOffset;
 	int32_t WLSeedGrossDelay[MAX_BYTE_LANES*MAX_LDIMMS];	/* Write Levelization Seed Gross Delay */
 								/* per byte Lane Per Logical DIMM*/
 	int32_t WLSeedFineDelay[MAX_BYTE_LANES*MAX_LDIMMS];	/* Write Levelization Seed Fine Delay */
 								/* per byte Lane Per Logical DIMM*/
 	int32_t WLSeedPreGrossDelay[MAX_BYTE_LANES*MAX_LDIMMS];	/* Write Levelization Seed Pre-Gross Delay */
 								/* per byte Lane Per Logical DIMM*/
-	u8 WLGrossDelay[MAX_BYTE_LANES*MAX_LDIMMS];	/* Write Levelization Gross Delay */
-							/* per byte Lane Per Logical DIMM*/
-	u8 WLFineDelay[MAX_BYTE_LANES*MAX_LDIMMS];	/* Write Levelization Fine Delay */
-							/* per byte Lane Per Logical DIMM*/
+	uint8_t WLSeedPreGrossPrevNibble[MAX_BYTE_LANES*MAX_LDIMMS];
+	uint8_t WLSeedGrossPrevNibble[MAX_BYTE_LANES*MAX_LDIMMS];
+	uint8_t WLSeedFinePrevNibble[MAX_BYTE_LANES*MAX_LDIMMS];
+								/* Seed values saved on the lower nibble pass for reuse on the upper nibble pass, per byte Lane Per Logical DIMM */
+	u8 WLGrossDelay[MAX_BYTE_LANES*MAX_LDIMMS];		/* Write Levelization Gross Delay */
+								/* per byte Lane Per Logical DIMM*/
+	u8 WLFineDelay[MAX_BYTE_LANES*MAX_LDIMMS];		/* Write Levelization Fine Delay */
+								/* per byte Lane Per Logical DIMM*/
 	u8 WLGrossDelayFirstPass[MAX_BYTE_LANES*MAX_LDIMMS];	/* First-Pass Write Levelization Gross Delay */
 								/* per byte Lane Per Logical DIMM*/
 	u8 WLFineDelayFirstPass[MAX_BYTE_LANES*MAX_LDIMMS];	/* First-Pass Write Levelization Fine Delay */
-- 
1.7.9.5

